import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
# NLTK English stop-word set shared by every tokenization helper below
stop = set(stopwords.words('english'))
# root of the sentiment dataset; train.csv holds the labelled training phrases
path = "sentiment_analysis/data/"
train_data_path ='sentiment_analysis/data/training_data/train.csv'
def readData(path):
    """Read the phrase dictionary and sentiment labels and join them.

    path: directory containing 'dictionary.txt' ("<phrase>|<id>" rows under
        a '!|0' header) and 'sentiment_labels.txt' ("<id>|<value>" rows).
    return: DataFrame with columns Phrase, phrase_ids, sentiment_values,
        containing only phrases that have a sentiment label.
    """
    # dictionary.txt: split each "<phrase>|<id>" row into two columns
    raw_phrases = pd.read_table(path + 'dictionary.txt')
    phrases = raw_phrases['!|0'].str.split('|', expand=True)
    phrases = phrases.rename(columns={0: 'Phrase', 1: 'phrase_ids'})
    # sentiment_labels.txt: split each "<id>|<value>" row the same way
    raw_labels = pd.read_table(path + 'sentiment_labels.txt')
    labels = raw_labels['phrase ids|sentiment values'].str.split('|', expand=True)
    labels = labels.rename(columns={0: 'phrase_ids', 1: 'sentiment_values'})
    # inner join: keep only phrases that carry a sentiment value
    return phrases.merge(labels, how='inner', on='phrase_ids')
# Load the prepared dictionary+labels frame and the training split, then
# inspect the distribution of sentiment values.
df_all_prepared = readData(path)
training_data = pd.read_csv(train_data_path, encoding='iso-8859-1')
training_data.head()
fig, ax = plt.subplots(1, 1)
training_data.hist(column='sentiment_values', ax=ax)
ax.set_title('Number of entries classified by sentiment analysis')
ax.set_xticks([i/10 for i in range(0, 10)])
# BUG FIX: the original summed the sentiment values lying in (0.5, 0.6)
# instead of counting them, so the printed "percent of entries" was wrong.
in_range = [x for x in training_data['sentiment_values'] if 0.5 < x < 0.6]
print('Percent of entries in range [0.5, 0.6] = %.3f %%' % (100 * len(in_range) / len(training_data)))
# First we read the embeddings file into a dictionary: each entry maps a word to the vector of numbers representing its embedding.
# Build word -> embedding-vector lookup from the 300-dimensional GloVe file.
embeddings_index = {}
# BUG FIX: use a context manager so the file handle is released even if a
# line fails to parse (the original left the file open on error).
with open(path + '/glovo/glove_6B/glove.6B.300d.txt') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]  ## The first entry is the word
        coefs = np.asarray(values[1:], dtype='float32')  ## These are the vectors representing the embedding for the word
        embeddings_index[word] = coefs
print('GloVe data loaded')
# example of the word representation in terms of vector
embeddings_index[','][:5]
import re
def clearWords(lines_without_stopwords, training_data, col_name="Phrase", stop_words=None):
    """Tokenize every row of training_data[col_name], dropping stop words.

    Appends one list of lower-cased word tokens per row to
    `lines_without_stopwords` (mutated in place).

    col_name: text column to tokenize.
    stop_words: set of words to drop; defaults to the module-level NLTK
        English stop-word set `stop` (new optional parameter, backward
        compatible — existing callers are unaffected).
    """
    if stop_words is None:
        stop_words = stop  # module-level NLTK stop-word set
    for line in training_data[col_name].values:
        line = line.lower()
        # \w+ keeps alphanumeric runs and discards punctuation
        line_by_words = re.findall(r'(?:\w+)', line, flags=re.UNICODE)
        lines_without_stopwords.append(
            [word for word in line_by_words if word not in stop_words])
# Tokenize the training phrases (stop words removed), then build padded
# integer sequences and one-hot decile labels for the classifier.
lines_without_stopwords=[]
clearWords(lines_without_stopwords, training_data)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
# keep only the 1000 most frequent tokens; pad/truncate every phrase to 100 ids
MAX_NUM_WORDS = 1000
MAX_SEQUENCE_LENGTH = 100
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(lines_without_stopwords)
sequences = tokenizer.texts_to_sequences(lines_without_stopwords) # set numbers to the words
word_index = tokenizer.word_index # create index for each word
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) # zero matrix
# map each sentiment value in [0, 1] to a decile class 0..9 (1.0 folds into class 9)
labels = to_categorical(np.asarray([round(i*10) if round(i*10) != 10 else 9 for i in training_data['sentiment_values']]), num_classes=10, dtype='float32')
print(data.shape)
print(labels.shape)
# prepare embedding matrix
from keras.layers import Embedding
from keras.initializers import Constant
## EMBEDDING_DIM = ## seems to need to match the embeddings_index dimension
# infer the embedding size from an arbitrary GloVe vector ('a')
EMBEDDING_DIM = embeddings_index.get('a').shape[0]
# +1 because tokenizer indices start at 1; row 0 stays all-zero
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
# row i of the matrix holds the GloVe vector of the word with tokenizer index i
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    ## This references the loaded embeddings dictionary
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
print(embedding_matrix.shape)
# visual sanity check: compare embedding vectors of example words
plt.plot(embedding_matrix[word_index.get('probably')]) # 0.5 - sent analysis - blue
plt.plot(embedding_matrix[word_index.get('best')]) # 0.9-1 - sent analysis - orange
# plt.plot(embedding_matrix[word_index.get('invited')]) # 0.1-0.2 - sent analysis - green
plt.title('example vectors')
# print(word_index.get('invited'))
# print(embeddings_index['invited'][:5])
# print(embedding_matrix[6128][:5])
## To create and visualize a model
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, Dropout, LSTM
# BiLSTM sentiment classifier over frozen GloVe embeddings;
# 10-way softmax output, one class per sentiment decile.
model = Sequential()
# CONSISTENCY FIX: use the shared MAX_SEQUENCE_LENGTH constant instead of a
# hard-coded 100, so the layer always matches the padding applied to `data`.
model.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.50))
model.add(Dense(10, activation='softmax'))
# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
print(labels.shape)
model.fit(data, np.array(labels), validation_split=0.1, epochs=1)
from sklearn.manifold import TSNE
## Get weights
# first layer is the frozen Embedding; [0] is its weight matrix
embds = model.layers[0].get_weights()[0]
## Plotting function
## Visualize words in two dimensions
tsne_embds = TSNE(n_components=2).fit_transform(embds)
plt.plot(tsne_embds[:,0],tsne_embds[:,1],'.')
# model.save(path + 'my_model.h5')
from keras.models import load_model
# NOTE(review): the save above is commented out — this load fails unless
# my_model.h5 was written by a previous run; confirm the file exists.
model = load_model(path + 'my_model.h5')
# We designed the model to provide a sentiment score between 0 and 1, with 0 being very negative and 1 being very positive. This was done by building a multi-class classification model, i.e. 10 classes, one class for each decile.
def clearByDividerWords(lines_without_stopwords, training_data, div, col_name="Phrase", stop_words=None):
    """Split each row of training_data[col_name] on `div`, then tokenize each
    part, dropping stop words; appends one flat token list per part to
    `lines_without_stopwords` (mutated in place).

    BUG FIXES vs the original draft: the non-default `div` came after the
    defaulted `col_name` (SyntaxError); the body referenced an undefined
    `sent` instead of `div_part`; results were appended to an undefined
    `div_part_without_stopwords` list.
    stop_words: optional override of the module-level NLTK stop set `stop`.
    """
    if stop_words is None:
        stop_words = stop
    for line in training_data[col_name].values:
        for div_part in line.split(div):
            div_part = div_part.lower()
            div_part_by_words = re.findall(r'(?:\w+)', div_part, flags=re.UNICODE)
            div_part_no_stps = list(filter(lambda word: word not in stop_words, div_part_by_words))
            lines_without_stopwords.append(div_part_no_stps)
def generate_essay_data_sentiment_matrix(filename):
    """Load an essay CSV and return the model's sentiment prediction per essay.

    filename: tab-separated file inside essayVSreview/ with a 'ReviewText'
        column (one essay per row).
    return: prediction matrix, one row of 10 class scores per essay.
    Relies on the module-level `clearWords`, `tokenizer` and `model`.
    """
    project_data_path = 'essayVSreview/'
    essays = pd.read_csv(project_data_path + filename, sep='\t')
    # tokenize each whole essay, dropping stop words
    cleaned = []
    clearWords(cleaned, essays, 'ReviewText')
    # map tokens to the training vocabulary ids and pad to fixed length
    seqs = tokenizer.texts_to_sequences(cleaned)
    padded = pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    return model.predict(padded)
# Run the trained sentiment model over each of the three corpora.
pred_essay_gay_merriage = generate_essay_data_sentiment_matrix('GayMarriage_400.csv')
pred_essay_gun_control = generate_essay_data_sentiment_matrix('GunControl_400.csv')
pred_essay_review_amt = generate_essay_data_sentiment_matrix('ReviewAMT_500_t.csv')
# sentiment analysis of each essay in gay_merriage
def show_each_essay_sentiment(pred, file, task, task_desc):
    """Overlay the 10-class sentiment distribution of every essay of one task.

    pred: prediction matrix, one row of class scores per essay.
    file: corpus name used in the plot title.
    task: row offset of the task (rows are interleaved in blocks of four).
    task_desc: human-readable task name for the title.

    BUG FIX: iterate up to len(pred) instead of the hard-coded 1600, which
    raised IndexError for any corpus with fewer rows.
    """
    for i in range(task, len(pred), 4):
        plt.plot(pred[i])
    plt.title('sentiment analysis of each essay in ' + file + ": " + task_desc)
    plt.xlabel('sentiment class')
    plt.ylabel('power value')
    plt.show()
# Row offsets of the four interleaved tasks and their display names.
copy1, copy2, true, fake = 0, 1, 2, 3
copy_1_desc, copy_2_desc, true_desc, fake_desc = "Copy_1", "Copy2", "True Essay", "Fake Essay"
# Plot every task of every corpus.
show_each_essay_sentiment(pred_essay_gay_merriage, 'Gay Meriage', copy1, copy_1_desc)
show_each_essay_sentiment(pred_essay_gay_merriage, 'Gay Meriage', copy2, copy_2_desc)
show_each_essay_sentiment(pred_essay_gay_merriage, 'Gay Meriage', true, true_desc)
show_each_essay_sentiment(pred_essay_gay_merriage, 'Gay Meriage', fake, fake_desc)
show_each_essay_sentiment(pred_essay_gun_control, 'Gun Control', copy1, copy_1_desc)
show_each_essay_sentiment(pred_essay_gun_control, 'Gun Control', copy2, copy_2_desc)
show_each_essay_sentiment(pred_essay_gun_control, 'Gun Control', true, true_desc)
show_each_essay_sentiment(pred_essay_gun_control, 'Gun Control', fake, fake_desc)
show_each_essay_sentiment(pred_essay_review_amt, 'Review AMT', copy2, copy_2_desc)
show_each_essay_sentiment(pred_essay_review_amt, 'Review AMT', copy1, copy_1_desc)
show_each_essay_sentiment(pred_essay_review_amt, 'Review AMT', true, true_desc)
show_each_essay_sentiment(pred_essay_review_amt, 'Review AMT', fake, fake_desc)
# For each file we have as many positive thoughts as negative ones.
# For review essays it's much easier to write narrow-minded sentences, so we can see no doubts in the middle of the plots.
def clearByDividerWords(lines_without_stopwords, training_data, div, col_name="Phrase", stop_words=None):
    """Split each row of training_data[col_name] on `div` and tokenize each
    part, dropping stop words.

    Appends one nested list per row — a list of token lists, one per part —
    to `lines_without_stopwords` (mutated in place).
    stop_words: optional override of the module-level NLTK stop set `stop`
        (new keyword parameter, backward compatible).
    """
    if stop_words is None:
        stop_words = stop
    # by row
    for line in training_data[col_name].values:
        div_part_without_stopwords = []
        for div_part in line.split(div):
            # BUG FIX: the original did " ".join([i.lower() for i in div_part]),
            # which iterates the string CHARACTER by character and space-joins
            # them, reducing every token to single letters.
            div_part = div_part.lower()
            div_part_by_words = re.findall(r'(?:\w+)', div_part, flags=re.UNICODE)
            div_part_without_stopwords.append(
                [w for w in div_part_by_words if w not in stop_words])
        lines_without_stopwords.append(div_part_without_stopwords)
def generate_sent_data_sentiment_matrix(filename):
    """Per-sentence sentiment predictions for each essay of a corpus file.

    filename: tab-separated CSV inside essayVSreview/ with a 'ReviewText'
        column.
    return: list with one prediction matrix per essay (rows = sentences,
        columns = the model's 10 sentiment-decile scores).
    Relies on module-level `clearByDividerWords`, `tokenizer` and `model`.
    """
    project_data_path = 'essayVSreview/'
    opened_filename = pd.read_csv(project_data_path + filename, sep='\t')
    opened_filename_without_stops = []
    pred_essay = []
    # BUG FIX: the original passed the global `gay_merriage` frame here, so
    # all three corpora produced predictions for the GayMarriage file.
    clearByDividerWords(opened_filename_without_stops, opened_filename, '.', 'ReviewText')
    for sent_without_stops in opened_filename_without_stops:
        sequences_sent = tokenizer.texts_to_sequences(sent_without_stops)
        sent_matrix = pad_sequences(sequences_sent, maxlen=MAX_SEQUENCE_LENGTH)  # pad each sentence to fixed length
        pred_essay.append(model.predict(sent_matrix))
    return pred_essay
# Per-sentence sentiment predictions for every corpus.
pred_sent_gay_merriage = generate_sent_data_sentiment_matrix('GayMarriage_400.csv')
pred_sent_gun_control = generate_sent_data_sentiment_matrix('GunControl_400.csv')
pred_sent_review_amt = generate_sent_data_sentiment_matrix('ReviewAMT_500_t.csv')
# sentiment analysis of each sentance in review_amt first essay
plt.figure()
# overlay the class distribution of every sentence of the first five essays
for i in range(len(pred_sent_review_amt[:5])):
    for j in range(len(pred_sent_review_amt[i])):
        plt.plot(pred_sent_review_amt[i][j])
plt.title('sentiment analysis of each sentance in review_amt first essay')
plt.xlabel('sentiment class')
plt.ylabel('power value')
plt.show()
# Each sentence consists of a large number of neutral words.
from collections import defaultdict
# Load the three keystroke-log corpora once for the pause analyses below.
proj_data_path = 'essayVSreview/'
# IDIOM FIX: read_csv already returns a DataFrame — the pd.DataFrame(...)
# wrapper around each call was redundant.
gay_merriage = pd.read_csv(proj_data_path + "GayMarriage_400.csv", sep='\t')
gun_control = pd.read_csv(proj_data_path + "GunControl_400.csv", sep='\t')
review_amt = pd.read_csv(proj_data_path + "ReviewAMT_500_t.csv", sep='\t')
files = [gay_merriage, gun_control, review_amt]
files_names = ['gay_merriage', 'gun_control', 'review_amt']
# JavaScript keyCodes: 32 = space bar, 190 = '.' (period)
SPACE = 32
PERIOD = 190
def find_average_writing_pause(file, filename, divider, task):
    '''
    Histogram the pauses between consecutive `divider` keystrokes for every
    row of `file` whose 'Task' column equals `task`.

    file: DataFrame with 'Task' and 'ReviewMeta' columns; 'ReviewMeta' is a
        ';'-separated log of "<timestamp> <event> <keyCode>" entries.
    filename: unused — kept only for interface compatibility with callers.
    divider: keyCode marking a unit boundary (SPACE -> words, PERIOD -> sentences).
    task: value of the 'Task' column to include.
    return: list(dict(hundrets: num_of_cases)) — one dict per matching row,
        pauses bucketed to the nearest 10 ms, values outside (0, 250) dropped.
    '''
    res = []
    # equivalent to the original range(index.start, index.stop) on the
    # default RangeIndex, but works for any index
    for id_ in file.index:
        if file['Task'][id_] != task:
            continue
        row_timestamp = []
        row_keycode = []
        for sep_meta in file['ReviewMeta'][id_].split(';'):
            # hoist the split (the original re-split each entry three times)
            parts = sep_meta.split()
            # skip malformed entries lacking timestamp/event/keyCode
            if len(parts) < 3:
                continue
            row_timestamp.append(int(parts[0]))
            row_keycode.append(int(parts[2]))
        # pauses between words
        # if we remove the divider check we'll get letters distribution
        timestamp_diff = []
        first = 0
        for timestamp_i in range(1, len(row_timestamp)):
            if row_keycode[timestamp_i] == divider:
                timestamp_diff.append(row_timestamp[timestamp_i] - row_timestamp[first])
                first = timestamp_i
        # bucket pauses to the nearest 10 ms, ignoring outliers
        dict_ = defaultdict(int)
        for diff in timestamp_diff:
            if 0 < diff < 250:
                dict_[round(diff / 10) * 10] += 1
        res.append(dict_)
    return res
def draw_plot(divider, task, title):
    # Draw one figure with a subplot per corpus in the module-level `files`,
    # overlaying the pause-length histogram of every essay of `task`.
    # divider: keyCode separating units (SPACE -> words, PERIOD -> sentences)
    # task: 'Task' column value to select; title: figure suptitle.
    # plot of the biggest number of cases
    fig, axs = plt.subplots(3, figsize=[18.5, 10.5])
    fig.suptitle(title)
    for i in range(len(files)):
        file = files[i]
        # the third corpus (review_amt) labels its tasks 'True/Fake Review'
        if i == 2 and task == 'True Essay':
            task = 'True Review'
        if i == 2 and task == 'Fake Essay':
            task = 'Fake Review'
        list_of_essay_pauses = find_average_writing_pause(file, files_names[i], divider, task)
        for dict_ in list_of_essay_pauses:
            try:
                lists = sorted(dict_.items())
                # zip(*...) raises ValueError for an empty dict -> skip that essay
                x, y = zip(*lists)
                axs[i].plot(x, y)
                axs[i].set(ylabel='number of pause cases in ' + files_names[i])
            except ValueError:
                continue
    for ax in axs.flat:
        ax.set(xlabel='timestamp pauses btw words')
# Pause-length distributions per task: SPACE pauses approximate between-word
# pauses, PERIOD pauses approximate between-sentence pauses.
# True essays words distribution
draw_plot(SPACE, 'True Essay', 'Data word-pauses distribution in TRUE essay:')
# Fake essays words distribution
draw_plot(SPACE, 'Fake Essay', 'Data word-pauses distribution in FAKE essay:')
# Copy True essays words distribution
draw_plot(SPACE, 'Copy_1', 'Data word-pauses distribution in Copy of TRUE essay:')
# Copy Fake essays words distribution
# NOTE(review): this passes 'Copy_1' although the title says Copy of FAKE;
# the sentence-level call below uses 'Copy_2' — confirm which was intended.
draw_plot(SPACE, 'Copy_1', 'Data word-pauses distribution in Copy of FAKE essay:')
# TODO: draw density function
# True essays sentance distribution
draw_plot(PERIOD, 'True Essay', 'Data sentance-pauses distribution in TRUE essay:')
# Fake essays sentance distribution
draw_plot(PERIOD, 'Fake Essay', 'Data sentance-pauses distribution in FAKE essay:')
# Copy True essays sentance distribution
draw_plot(PERIOD, 'Copy_1', 'Data sentance-pauses distribution in Copy of TRUE essay:')
# Copy Fake essays sentance distribution
draw_plot(PERIOD, 'Copy_2', 'Data sentance-pauses distribution in Copy of FAKE essay:')
# TODO: draw density function
# As we can see above, this is where the most common (average) pause falls:
# NOTE(review): PERIOD is redefined here as the STRING '190' (it was the int
# 190 above) because below it is compared against raw split() tokens — confirm
# every later consumer expects the string form.
PERIOD = '190'
# JavaScript keyCodes for the arrow keys and backspace
ARROWS = [37, 38, 39, 40]
BACKSPACE = 8
# pauses outside [SENT_AV_BOTTOM_LIM, SENT_AV_TOP_LIM] ms count as 'special'
SENT_AV_TOP_LIM = 125
SENT_AV_BOTTOM_LIM = 25
def analise_sentance_keystroke(file):
    '''
    list of dictionaries with each file (by sentance) analysis
    return: list(dict('trueEssay': dict('MouseUp', 'Freq'...), 'falseEssay': dict('MouseUp', 'Freq'...)))

    Walks each row's ReviewMeta keystroke log, cutting it at PERIOD keycodes,
    and collects per-event counters (mouse selection span, arrow keys,
    deletions) plus the running list of 'special' inter-keystroke pauses.
    '''
    f_indx = file.index
    sentence_keystroke_true = []
    sentence_keystroke_false = []
    each_sent_res = []
    last_timestamp = 0
    i_end = 0
    # each row-essay in a file
    # BUG FIX: iterate this file's own rows; the original hard-coded range(start, 1600)
    for id in range(f_indx.start, f_indx.stop):
        key_down_freq = []
        # BUG FIX: the original indexed the global `gay_merriage` here instead
        # of the `file` argument, so every call analysed the same corpus.
        keystroke_meta_data = file['ReviewMeta'][id].split(';')
        i_start = i_end
        if i_start >= len(keystroke_meta_data):
            continue
        row_meta = keystroke_meta_data[i_start]
        # true-essay tasks accumulate separately from everything else
        if file['Task'][id] in ['Copy_1', 'True Essay']:
            sent_keystroke_data = sentence_keystroke_true
        else:
            sent_keystroke_data = sentence_keystroke_false
        # advance i_end to the next sentence terminator (PERIOD keyCode).
        # BUG FIX: guard the [2] access (needs len >= 3, the original tested
        # >= 2 and could IndexError) and bound i_end to the list length.
        while (i_end < len(keystroke_meta_data) - 1
                and len(row_meta.split()) >= 3
                and row_meta.split()[2] != PERIOD):
            i_end += 1
            row_meta = keystroke_meta_data[i_end]
        for sent_event in keystroke_meta_data[i_start:i_end]:
            meta_sent_parser = sent_event.split()
            # skip malformed "<timestamp> <event> <keyCode>" entries
            if len(meta_sent_parser) < 3:
                continue
            # NOTE(review): a fresh dict is created per *event*, so each result
            # entry holds at most one counter — confirm this was intended
            # rather than one dict per sentence.
            sent_keystroke_data_dict = defaultdict(int)
            event = meta_sent_parser[1]
            # collect KeyStroke data out of the sentence
            cur_timestamp = int(meta_sent_parser[0])
            timestamp_diff = cur_timestamp - last_timestamp
            # choose only 'special' data: pauses outside the average band
            if timestamp_diff > SENT_AV_TOP_LIM or \
                    timestamp_diff < SENT_AV_BOTTOM_LIM:
                key_down_freq.append(timestamp_diff)
            last_timestamp = cur_timestamp
            if event == 'MouseUp':
                # for MouseUp events fields 2..3 appear to span the selection
                if len(meta_sent_parser) >= 4:
                    sent_keystroke_data_dict['mouse_select'] += int(meta_sent_parser[3]) - int(meta_sent_parser[2])
            elif int(meta_sent_parser[2]) in ARROWS:
                sent_keystroke_data_dict['arrows'] += 1
            elif int(meta_sent_parser[2]) == BACKSPACE:
                sent_keystroke_data_dict['del'] += 1
            sent_keystroke_data_dict['key_down_freq'] = key_down_freq
            # list of each sentance analysises
            sent_keystroke_data.append(sent_keystroke_data_dict)
        essay_res_with_list_of_sent = {'trueEssay': sentence_keystroke_true, 'falseEssay': sentence_keystroke_false}
        each_sent_res.append(essay_res_with_list_of_sent)
    return each_sent_res
# Per-sentence keystroke analysis of each corpus.
dict_of_gayMeraige_sent_keystroke = analise_sentance_keystroke(files[0])
dict_of_gunControl_sent_keystroke = analise_sentance_keystroke(files[1])
dict_of_reviewAMT_sent_keystroke = analise_sentance_keystroke(files[2])
# bare expression — notebook cell output showing how many entries were produced
len(dict_of_gayMeraige_sent_keystroke)
# NOTE(review): with TOP_LIM (0) below BOTTOM_LIM (150) the filter
# `diff > TOP or diff < BOTTOM` is true for every positive pause —
# confirm these limits are intended.
WORDS_AV_TOP_LIM = 0
WORDS_AV_BOTTOM_LIM = 150
def analise_essay_keystroke(file):
    """Aggregate keystroke statistics over whole essays.

    Accumulates mouse-selection spans, arrow-key and backspace counts into
    one dict for true-essay tasks and one for the rest, and records
    'special' inter-keystroke pauses.

    file: DataFrame with 'Task' and 'ReviewMeta' columns.
    return: per-row pause histograms (dict(bucket-ms: count)) followed by a
        final summary dict {'trueEssay': dict_true, 'falseEssay': dict_false}.
    """
    file_ind = file.index
    dict_true = defaultdict(int)
    dict_false = defaultdict(int)
    key_down_freq = []
    last_timestamp = 0
    res = []
    for id in range(file_ind.start, file_ind.stop, file_ind.step):
        # true-essay tasks share one accumulator, everything else the other
        dict_to_write = dict_true if file['Task'][id] in ['Copy_1', 'True Essay'] else dict_false
        keystroke_meta_data = file['ReviewMeta'][id].split(';')
        for i in range(len(keystroke_meta_data)):
            meta_essay_parser = keystroke_meta_data[i].split()
            if len(meta_essay_parser) < 2:
                continue
            event = meta_essay_parser[1]
            # collect KeyStroke data out of the essay
            cur_timestamp = int(meta_essay_parser[0])
            timestamp_diff = cur_timestamp - last_timestamp
            # choose only 'special' data
            # NOTE(review): with WORDS_AV_TOP_LIM == 0 and BOTTOM_LIM == 150
            # this condition accepts every positive pause — confirm intent.
            if timestamp_diff > WORDS_AV_TOP_LIM or \
                    timestamp_diff < WORDS_AV_BOTTOM_LIM:
                # clamp negative diffs (the per-row clocks can restart)
                if timestamp_diff < 0:
                    cur_timestamp = last_timestamp
                    timestamp_diff = cur_timestamp - last_timestamp
                key_down_freq.append(timestamp_diff)
            last_timestamp = cur_timestamp
            if event == 'MouseUp':
                dict_to_write['mouse_select'] += int(meta_essay_parser[3]) - int(meta_essay_parser[2])
            elif int(meta_essay_parser[2]) in ARROWS:
                dict_to_write['arrows'] += 1
            elif int(meta_essay_parser[2]) == BACKSPACE:
                dict_to_write['del'] += 1
        dict_to_write['key_down_freq'] = key_down_freq
        # BUG FIX: the original iterated `timestamp_diff` (an int) here, which
        # raises TypeError; the pause list to histogram is `key_down_freq`.
        dict_ = defaultdict(int)
        for pause in key_down_freq:
            if not (0 < pause < 250):
                continue
            dict_[round(pause / 10) * 10] += 1
        res.append(dict_)
    res.append({'trueEssay': dict_true, 'falseEssay': dict_false})
    return res
# One file parsed data keystroke analysis
dict_of_gayMeriage_file_keystroke = analise_essay_keystroke(files[0])
dict_of_gunControl_file_keystroke = analise_essay_keystroke(files[1])
dict_of_reviewAMT_file_keystroke = analise_essay_keystroke(files[2])
key_down_freq = []
# NOTE(review): element [0] of the result is a per-row pause histogram, not
# the summary dict — the {'trueEssay': ...} summary is the LAST element of
# the returned list, so this indexing likely does not do what was intended.
max(sorted(list(filter(lambda x: x < 1000, dict_of_reviewAMT_file_keystroke[0]['trueEssay']['key_down_freq']))))
# for i in range(1600):
# key_down_freq = dict_of_gayMeriage_file_keystroke[i]['trueEssay']['key_down_freq']
# TODO: next steps
from collections import defaultdict
PERIOD = '190'
# Scratch cell: map each sentence of an essay to its keystroke data.
# BUG FIXES: the original ended in an incomplete assignment ("... =", a
# SyntaxError) and called .split on the whole 'ReviewText' Series; completed
# minimally — per the length check below, the first essay was the target.
sentance_keystroke_corespondence = defaultdict(int)
file = gay_merriage
for sentence in file['ReviewText'][0].split('.'):
    # TODO(review): replace the placeholder with real per-sentence keystroke stats
    sentance_keystroke_corespondence[sentence] = 0
len(gay_merriage['ReviewText'][0].split('.'))